{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "InteractiveShell.ast_node_interactivity = 'all'  # default is ‘last_expr'\n",
    "\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append('/Users/siyuyang/Source/repos/GitHub_MSFT/CameraTraps_data')  # append this repo to PYTHONPATH"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "from collections import Counter, defaultdict\n",
    "from random import sample, shuffle\n",
    "import math\n",
    "\n",
    "from tqdm import tqdm\n",
    "\n",
    "from data_management.megadb.schema import sequences_schema_check\n",
    "from data_management.annotations.add_bounding_boxes_to_megadb import *\n",
    "from data_management.megadb.converters.cct_to_megadb import make_cct_embedded, process_sequences, write_json"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Importing the UBC dataset (private) to MegaDB\n",
    "\n",
    "First draft by Gramener. Siyu re-ran with verified data (~400k) on 2020 Sept 2. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "path_to_output = '.../data/CameraTraps/MegaDB/ubc_fennell_megadb.json'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_name = 'ubc_fennell'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# path to the CCT json, or a loaded json object\n",
    "path_to_image_cct = '.../data/CameraTraps/CCT_JSONs/ubc.json'  # set to None if not available\n",
    "path_to_bbox_cct = None  # set to None if not available\n",
    "assert not (path_to_image_cct is None and path_to_bbox_cct is None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(path_to_image_cct) as f:\n",
    "    cct_json = json.load(f)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "There are back slashes in the file names in the CCT JSON - changing them to forward slash"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in cct_json['images']:\n",
    "    i['file_name'] = i['file_name'].replace('\\\\', '/')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cct_json['images'][1000]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cct_json['annotations'][1000]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "## save a copy\n",
    "write_json('.../data/CameraTraps/CCT_JSONs/ubc_fennell_cct.json', cct_json)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading image DB...\n",
      "Number of items from the image DB: 441483\n",
      "Number of images with more than 1 species: 49178 (11.14% of image DB)\n",
      "No bbox DB provided.\n"
     ]
    }
   ],
   "source": [
    "# pass in the updated CCT JSON\n",
    "embedded = make_cct_embedded(image_db=cct_json)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sequences = process_sequences(embedded, dataset_name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We have no sequence information but it seems that the annotation was done at the sequence level... Moving various properties back to the image level so that it makes more sense for future queries."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "for seq in sequences:\n",
    "    assert len(seq['images']) == 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i_seq, seq in enumerate(sequences):\n",
    "    del seq['id']  # seems to be exactly the same as file path to the only image\n",
    "    \n",
    "    for prop in ['species_count', 'datetime', 'class']:\n",
    "        if prop in seq:\n",
    "            seq['images'][0][prop] = seq[prop]\n",
    "            del seq[prop]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "json.dumps(sample(sequences, 3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Verified that the sequence items meet requirements not captured by the schema.\n",
      "Verified that the sequence items conform to the schema.\n"
     ]
    }
   ],
   "source": [
    "sequences_schema_check.sequences_schema_check(sequences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "write_json(path_to_output, sequences)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train/val/test splits\n",
    "in 70:15:15, as there are lots of vehicles in this dataset and we need more vehicles/humans in our val/test splits to better evaluate megadetector."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "for seq in sequences:\n",
    "    assert 'location' in seq"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "197"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "locations = set()\n",
    "for seq in sequences:\n",
    "    locations.add(seq['location'])\n",
    "    \n",
    "len(locations)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "first loc before shuffling is South Chilcotins Wildlife Survey 2018_F7\n",
      "first loc after shuffling is South Chilcotins Wildlife Survey 2018_L9\n"
     ]
    }
   ],
   "source": [
    "li_locations = list(locations)\n",
    "print(f'first loc before shuffling is {li_locations[0]}')\n",
    "shuffle(li_locations)\n",
    "print(f'first loc after shuffling is {li_locations[0]}')\n",
    "\n",
    "num_train = round(0.7 * len(locations))\n",
    "num_val = round(0.15 * len(locations))\n",
    "\n",
    "locs_train = li_locations[:num_train]\n",
    "locs_val = li_locations[num_train:num_train + num_val]\n",
    "locs_test = li_locations[num_train + num_val:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "splits_table = [\n",
    "    {\n",
    "        'dataset': dataset_name,\n",
    "        'train': locs_train,\n",
    "        'val': locs_val,\n",
    "        'test': locs_test\n",
    "    }\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('.../CameraTrap/Databases/megadb_2020/ubc_fennell_splits.json', 'w') as f:\n",
    "    json.dump(splits_table, f, indent=4)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
